rr library(readxl) library(readr) library(dplyr) library(here) library(tidyverse) library(stringr)
rr sheet1 <- read_xlsx(here(_data/boing-boing-candy-2015.xlsx)) sheet1
rr dim(sheet1) colnames(sheet1) glimpse(sheet1)
rr sheet1_long <- sheet1 %>% pivot_longer(cols = starts_with([), #converting rows to columns names_to = , values_to = )
#check dimensions, column names dim(sheet1_long) colnames(sheet1_long)
#Omitting columns that are not required
rr sheet1_long <- sheet1_long %>% select(-c(4:29))
colnames(sheet1_long) dim(sheet1_long)
rr colnames(sheet1_long) <- c(, , _out, , ) colnames(sheet1_long)
[1] \timestamp\ \age\ \going_out\ \candy\ \rating\
#Add column ‘year’
rr sheet1_long <- sheet1_long %>% add_column(year = 2015) head(sheet1_long)
rr sheet2 <- read_xlsx(here(_data/boing-boing-candy-2016.xlsx)) sheet2
rr dim(sheet2) colnames(sheet2) glimpse(sheet2)
rr sheet2_long <- sheet2 %>% pivot_longer(cols = starts_with([), #converting rows to columns names_to = , values_to = )
dim(sheet2_long) colnames(sheet2_long)
#Omitting columns that are not required
rr sheet2_long <- sheet2_long %>% select(-c(6:22))
colnames(sheet2_long)
[1] \Timestamp\ \Are you going actually going trick or treating yourself?\
[3] \Your gender:\ \How old are you?\
[5] \Which country do you live in?\ \candy\
[7] \rating\
rr dim(sheet2_long)
[1] 127159 7
rr colnames(sheet2_long) <- c(, _out, , , , , ) colnames(sheet2_long)
[1] \timestamp\ \going_out\ \gender\ \age\ \country\ \candy\ \rating\
#Add column ‘year’
rr sheet2_long <- sheet2_long %>% add_column(year = 2016) head(sheet2_long) r distinct(sheet2_long, country)
rr sheet3 <- read_xlsx(here(_data/boing-boing-candy-2017.xlsx))
New names:
* `` -> ...114
rr sheet3
rr dim(sheet3) colnames(sheet3) glimpse(sheet3)
rr sheet3_long <- sheet3 %>% pivot_longer(cols = starts_with(6), #converting rows to columns names_to = , values_to = ) colnames(sheet3_long)
[1] \Internal ID\ \Q1: GOING OUT?\ \Q2: GENDER\ \Q3: AGE\
[5] \Q4: COUNTRY\ \Q5: STATE
#Omitting columns that are not required
rr sheet3_long <- sheet3_long %>% select(-c(6:17)) colnames(sheet3_long)
[1] \Internal ID\ \Q1: GOING OUT?\ \Q2: GENDER\ \Q3: AGE\ \Q4: COUNTRY\ \candy\ \rating\
rr dim(sheet3_long)
[1] 253380 7
#Change col names
rr colnames(sheet3_long) <- c(_id, _out, , , , , ) colnames(sheet3_long)
[1] \internal_id\ \going_out\ \gender\ \age\ \country\ \candy\ \rating\
rr sheet3_long r #summarise((count = sum(is.na(rating))))
#Add column ‘year’
rr sheet3_long <- sheet3_long %>% add_column(year = 2017) head(sheet3_long) r distinct(sheet3_long, country)
#X X X X X X X X X X X X X X X X X X X X X X X X X X X X X X
rr
candy_combined <- bind_rows(sheet1_long, sheet2_long, sheet3_long)
colnames(candy_combined)
[1] \timestamp\ \age\ \going_out\ \candy\ \rating\ \year\ \gender\ \country\ \internal_id\
rr dim(candy_combined)
[1] 915389 9
rr head(candy_combined)
#check
rr distinct(candy_combined, going_out)
#Changing column data type
rr candy_combined\(age <- as.integer(candy_combined\)age)
NAs introduced by coercionNAs introduced by coercion to integer range
rr candy_combined\(year <- as.integer(candy_combined\)year) candy_combined\(internal_id <- as.integer(candy_combined\)internal_id) head(candy_combined) r NA
rr candy_combined <- candy_combined %>% filter(age <= 122)
length(candy_combined$age)
[1] 869160
rr distinct(candy_combined, age)
rr candy_combined <- candy_combined %>% mutate_if(is.character, str_to_lower)
head(candy_combined) r unique(candy_combined$country)
[1] NA
[2] \canada\
[3] \usa\
[4] \us\
[5] \uk\
[6] \united states of america\
[7] \japan\
[8] \united states\
[9] \france\
[10] \ussa\
[11] \u.s.a.\
[12] \england\
[13] \switzerland\
[14] \murica\
[15] \united kingdom\
[16] \neverland\
[17] \usa!\
[18] \korea\
[19] \u.s.\
[20] \america\
[21] \units states\
[22] \belgium\
[23] \croatia\
[24] \portugal\
[25] \usa usa usa\
[26] \the best one - usa\
[27] \usa! usa! usa!\
[28] \españa\
[29] \there isn't one for old men\
[30] \panama\
[31] \the yoo ess of aaayyyyyy\
[32] \united kindom\
[33] \hungary\
[34] \austria\
[35] \new zealand\
[36] \germany\
[37] \mexico\
[38] \australia\
[39] \brasil\
[40] \god's country\
[41] \south korea\
[42] \usa!!!!!!\
[43] \philippines\
[44] \eua\
[45] \usa! usa!\
[46] \sweden\
[47] \united sates\
[48] \the netherlands\
[49] \finland\
[50] \merica\
[51] \china\
[52] \kenya\
[53] \the republic of cascadia\
[54] \united stetes\
[55] \usa usa usa usa\
[56] \united states of america\
[57] \netherlands\
[58] \united state\
[59] \united staes\
[60] \uae\
[61] \usausausa\
[62] \unhinged states\
[63] \us of a\
[64] \unites states\
[65] \the united states\
[66] \north carolina\
[67] \unied states\
[68] \europe\
[69] \earth\
[70] \u s\
[71] \u.k.\
[72] \costa rica\
[73] \the united states of america\
[74] \unite states\
[75] \cascadia\
[76] \greece\
[77] \usa? hard to tell anymore..\
[78] \'merica\
[79] \usas\
[80] \pittsburgh\
[81] \a\
[82] \can\
[83] \canae\
[84] \new york\
[85] \trumpistan\
[86] \ireland\
[87] \california\
[88] \south africa\
[89] \i pretend to be from canada
rr country_distinct <- distinct(candy_combined, country)
rr candy_combined <- candy_combined %>% mutate(candy = str_extract(candy, \[^\\[\\]]+), candy = str_extract(candy, [^q6 |]+[^|]+))
#cleaning country column
rr canada_error <- c(, `) uk_error <- c(, , , k., kindom, kingdom) usa_error <- c('merica, , , , , . america, jersey, york, carolina, , united states, united states of america, yoo ess of aaayyyyyy, , , a, s., s.a.,
states, states, states, states of america, sates, staes, state, statea, stated, states, states of america, statss, stetes, ststes, states, states, , of a, usa usa, usa usa usa, usa usa!!!!, !, ! usa!, ! usa! usa!, !!!!!!, ? hard to tell anymore.., , , ,
) candy_combined <- candy_combined %>% mutate(country = ifelse(country %in% canada_error, , country), country = ifelse(country %in% uk_error, , country), country = ifelse(country %in% usa_error, , country))
country_distinct <- distinct(candy_combined, country)
candy_combined %>%
filter(candy != "cash, or other forms of legal tender")
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
1.4.2 Analysis questions 1. What is the total number of candy ratings given across the three years. (number of candy ratings, not number of raters. Don’t count missing values)
rr candy_combined %>% filter(!is.na(rating)) %>% summarise(total_rating_count = n()) r
#total_candy_ratings
rr candy_combined %>% select(going_out, age) %>% filter(!is.na(going_out)) %>% group_by(going_out) %>% summarise(average_age = mean(age))
`summarise()` ungrouping output (override with `.groups` argument)
rr candy_combined %>% select(candy, rating) %>% filter(!is.na(rating)) %>% group_by(rating, candy) %>% summarise(count = n()) %>% arrange(desc(count)) %>% slice(seq_len(1))
`summarise()` regrouping output by 'rating' (override with `.groups` argument)
rr distinct(candy_combined, candy)
rr candy_combined %>% select(candy, rating) %>% filter(candy == , rating == ) %>% group_by(rating) %>% summarise(starburst_count = n()) #%>%
`summarise()` ungrouping output (override with `.groups` argument)
rr #arrange(desc(count)) #%>% #slice(seq_len(3))
For the next three questions, count despair as -1, joy as +1 and meh as 0.
rr candy_combined %>% select(gender, candy, rating) %>% filter(!is.na(rating)) %>% mutate(new_rating = case_when(rating == ~ -1, rating == ~ 1, rating == ~ 0,)) %>% group_by(gender, candy) %>% summarise(count_popular = sum(new_rating)) %>% arrange(desc(count_popular)) %>% slice(seq_len(1))
`summarise()` regrouping output by 'gender' (override with `.groups` argument)
rr candy_combined %>% select(year, candy, rating) %>% filter(!is.na(rating)) %>% mutate(new_rating = case_when(rating == ~ -1, rating == ~ 1, rating == ~ 0,)) %>% group_by(year, candy) %>% summarise(popular_candy = sum(new_rating)) %>% arrange(desc(popular_candy)) %>% slice(seq_len(1))
`summarise()` regrouping output by 'year' (override with `.groups` argument)
TEST XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
rr sheet3_long %>% mutate(new_candy3= str_extract(candy, [^Q6 |]+[^|]+)) r new_candy3
Error: object 'new_candy3' not found
rr sheet2_long %>% mutate(new_candy2 = str_extract(candy, \[^\\[\\]]+)) r new_candy2
Error: object 'new_candy2' not found